{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Text classification and sentiment analysis"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Once text data has been converted into numerical features using the natural language processing techniques discussed in the previous sections, text classification works just like any other classification task.\n",
    "\n",
    "In this notebook, we will apply these preprocessing technique to news articles, product reviews, and Twitter data and teach various classifiers to predict discrete news categories, review scores, and sentiment polarity."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Imports"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-11-26T06:37:13.006374Z",
     "start_time": "2018-11-26T06:37:12.515786Z"
    }
   },
   "outputs": [],
   "source": [
    "%matplotlib inline\n",
    "import warnings\n",
    "from collections import Counter, OrderedDict\n",
    "from pathlib import Path\n",
    "\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "from pandas.io.json import json_normalize\n",
    "import pyarrow as pa   \n",
    "import pyarrow.parquet as pq\n",
    "from fastparquet import ParquetFile \n",
    "from scipy import sparse\n",
    "from scipy.spatial.distance import pdist, squareform\n",
    "\n",
    "# Visualization\n",
    "import matplotlib.pyplot as plt\n",
    "from matplotlib.ticker import FuncFormatter, ScalarFormatter\n",
    "import seaborn as sns\n",
    "\n",
    "# spacy, textblob and nltk for language processing\n",
    "from textblob import TextBlob, Word\n",
    "\n",
    "# sklearn for feature extraction & modeling\n",
    "from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer\n",
    "from sklearn.model_selection import train_test_split\n",
    "from sklearn.naive_bayes import MultinomialNB\n",
    "from sklearn.linear_model import LogisticRegression\n",
    "from sklearn.metrics import roc_auc_score, roc_curve, accuracy_score, confusion_matrix\n",
    "from sklearn.externals import joblib\n",
    "\n",
    "import lightgbm as lgb\n",
    "\n",
    "import json\n",
    "from time import clock, time"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-11-26T06:37:13.010613Z",
     "start_time": "2018-11-26T06:37:13.007802Z"
    }
   },
   "outputs": [],
   "source": [
    "plt.style.use('fivethirtyeight')\n",
    "warnings.filterwarnings('ignore')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## News article classification"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "We start with an illustration of the Naive Bayes model for news article classification using the BBC articles that we read as before to obtain a DataFrame with 2,225 articles from 5 categories."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Read BBC articles"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-11-22T18:43:12.744002Z",
     "start_time": "2018-11-22T18:43:12.665470Z"
    }
   },
   "outputs": [],
   "source": [
    "path = Path('data', 'bbc')\n",
    "files = path.glob('**/*.txt')\n",
    "doc_list = []\n",
    "for i, file in enumerate(files):\n",
    "    topic = file.parts[-2]\n",
    "    article = file.read_text(encoding='latin1').split('\\n')\n",
    "    heading = article[0].strip()\n",
    "    body = ' '.join([l.strip() for l in article[1:]])\n",
    "    doc_list.append([topic, heading, body])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-11-22T18:43:13.064527Z",
     "start_time": "2018-11-22T18:43:13.048580Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<class 'pandas.core.frame.DataFrame'>\n",
      "RangeIndex: 2225 entries, 0 to 2224\n",
      "Data columns (total 3 columns):\n",
      "topic      2225 non-null object\n",
      "heading    2225 non-null object\n",
      "body       2225 non-null object\n",
      "dtypes: object(3)\n",
      "memory usage: 52.2+ KB\n"
     ]
    }
   ],
   "source": [
    "docs = pd.DataFrame(doc_list, columns=['topic', 'heading', 'body'])\n",
    "docs.info()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Create stratified train-test split"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "We split the data into the default 75:25 train-test sets, ensuring that the test set classes closely mirror the train set:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-11-22T18:43:29.481585Z",
     "start_time": "2018-11-22T18:43:29.476360Z"
    }
   },
   "outputs": [],
   "source": [
    "y = pd.factorize(docs.topic)[0]\n",
    "X = docs.body\n",
    "X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1, stratify=y)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Vectorize text data"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "We proceed to learn the vocabulary from the training set and transforming both dataset using the CountVectorizer with default settings to obtain almost 26,000 features:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-11-22T18:43:39.761327Z",
     "start_time": "2018-11-22T18:43:39.332132Z"
    }
   },
   "outputs": [],
   "source": [
    "vectorizer = CountVectorizer()\n",
    "X_train_dtm = vectorizer.fit_transform(X_train)\n",
    "X_test_dtm = vectorizer.transform(X_test)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-11-22T18:43:42.048891Z",
     "start_time": "2018-11-22T18:43:42.042059Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "((1668, 25919), (557, 25919))"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "X_train_dtm.shape, X_test_dtm.shape"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Train Multi-class Naive Bayes model"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-11-22T18:43:50.244608Z",
     "start_time": "2018-11-22T18:43:50.223308Z"
    }
   },
   "outputs": [],
   "source": [
    "nb = MultinomialNB()\n",
    "nb.fit(X_train_dtm, y_train)\n",
    "y_pred_class = nb.predict(X_test_dtm)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Evaluate Results"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "We evaluate the multiclass predictions using accuracy to find the default classifier achieved almost 98%:"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Accuracy"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-11-22T18:43:57.224720Z",
     "start_time": "2018-11-22T18:43:57.208935Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.9766606822262118"
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "accuracy_score(y_test, y_pred_class)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Confusion matrix"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-11-22T18:44:00.292728Z",
     "start_time": "2018-11-22T18:44:00.268874Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>0</th>\n",
       "      <th>1</th>\n",
       "      <th>2</th>\n",
       "      <th>3</th>\n",
       "      <th>4</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>98</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>2</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>0</td>\n",
       "      <td>128</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>102</td>\n",
       "      <td>2</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>2</td>\n",
       "      <td>0</td>\n",
       "      <td>5</td>\n",
       "      <td>121</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>95</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "    0    1    2    3   4\n",
       "0  98    0    0    2   0\n",
       "1   0  128    0    0   0\n",
       "2   0    0  102    2   0\n",
       "3   2    0    5  121   0\n",
       "4   0    0    1    1  95"
      ]
     },
     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "pd.DataFrame(confusion_matrix(y_true=y_test, y_pred=y_pred_class))"
   ]
  }
 ],
 "metadata": {
  "celltoolbar": "Slideshow",
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.8"
  },
  "toc": {
   "base_numbering": 1,
   "nav_menu": {},
   "number_sections": true,
   "sideBar": true,
   "skip_h1_title": true,
   "title_cell": "Table of Contents",
   "title_sidebar": "Contents",
   "toc_cell": false,
   "toc_position": {
    "height": "calc(100% - 180px)",
    "left": "10px",
    "top": "150px",
    "width": "316px"
   },
   "toc_section_display": true,
   "toc_window_display": true
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
